16S-rDNA-V3-V4 repository: https://github.com/ycl6/16S-rDNA-V3-V4

cutadapt: GitHub, Documentation, Paper

DADA2: GitHub, Documentation, Paper

Demo Dataset: PRJEB27564 from Gut Microbiota in Parkinson’s Disease: Temporal Stability and Relations to Disease Progression. EBioMedicine. 2019;44:691-707

License: GPL-3.0


Primer trimming using cutadapt

You can use the included Perl script run_trimming.pl to perform primer trimming, where the raw fastq files are placed in the raw folder

cd /ngs/16S-Demo

# Usage: perl run_trimming.pl project_folder fastq_folder forward_primer_seq reverse_primer_seq
perl run_trimming.pl PRJEB27564 raw CCTACGGGNGGCWGCAG GACTACHVGGGTATCTAATCC

Or follow the instructions below to run cutadapt within R

Start R

cd /ngs/16S-Demo/PRJEB27564

R

Load package and set path

library("dada2")
library("data.table")
library("phyloseq")
library("ggplot2")

Change the fastq path to correspond to the location of the raw fastq files

# Working sub-directories used throughout the workflow
fastq <- "raw"           # raw fastq files
trimmed <- "trimmed"     # cutadapt trimmed fastq files
filt <- "filt"           # dada2 trimmed fastq files
outfiles <- "outfiles"   # output files
images <- "images"       # output images

# Create the output directories if they do not already exist
invisible(lapply(c(filt, outfiles, images),
                 function(d) if (!dir.exists(d)) dir.create(d)))

# Show the first 15 files in the raw fastq folder
head(list.files(fastq), 15)
##  [1] "download.sh"           "ERR2730149_1.fastq.gz" "ERR2730149_2.fastq.gz"
##  [4] "ERR2730151_1.fastq.gz" "ERR2730151_2.fastq.gz" "ERR2730154_1.fastq.gz"
##  [7] "ERR2730154_2.fastq.gz" "ERR2730155_1.fastq.gz" "ERR2730155_2.fastq.gz"
## [10] "ERR2730158_1.fastq.gz" "ERR2730158_2.fastq.gz" "ERR2730162_1.fastq.gz"
## [13] "ERR2730162_2.fastq.gz" "ERR2730163_1.fastq.gz" "ERR2730163_2.fastq.gz"

Primer trimming in R

The trimming program cutadapt is called using the system2 function to perform the trimming.

Note: Skip this step if trimming has already been performed

# List the raw fastq files and split them into forward (R1) and reverse (R2)
# reads. The patterns are escaped and anchored: an unescaped "." matches ANY
# character and an unanchored pattern can match in the middle of a filename,
# so "1.fastq.gz" is fragile against unexpected file names.
fns <- sort(list.files(fastq, full.names = TRUE))
fnFs <- fns[grep("_1\\.fastq\\.gz$", fns)]
fnRs <- fns[grep("_2\\.fastq\\.gz$", fns)]

if (!dir.exists(trimmed)) dir.create(trimmed)

# Output paths for the primer-trimmed fastq files and per-sample cutadapt logs;
# sample names are the file names with the "_1.fastq.gz" suffix removed
fnFs.cut <- file.path(trimmed, basename(fnFs))
fnRs.cut <- file.path(trimmed, basename(fnRs))
log.cut <- sub("_1\\.fastq\\.gz$", ".log", fnFs.cut)
sample.names <- sub("_1\\.fastq\\.gz$", "", basename(fnFs.cut))

# Define the primer set used to perform PCR
# Primer pair used for the V3-V4 PCR amplification
FWD <- "CCTACGGGNGGCWGCAG"       # 341F
REV <- "GACTACHVGGGTATCTAATCC"   # 785R

# Reverse complements, needed to detect read-through into the opposite primer
FWD.RC <- dada2::rc(FWD)
REV.RC <- dada2::rc(REV)

# R1 (forward reads): trim FWD at the 5' end and the
# reverse complement of REV at the 3' end
R1.flags <- paste("-g", FWD, "-a", REV.RC)
# R2 (reverse reads): trim REV at the 5' end and the
# reverse complement of FWD at the 3' end
R2.flags <- paste("-G", REV, "-A", FWD.RC)

# Full path to the cutadapt executable -- change this to match your installation
cutadapt <- "/path-to-bin/cutadapt"

# Run cutadapt on every sample pair; stdout and stderr of each run are
# captured in that sample's log file
for (i in seq_along(fnFs)) {
    print(paste0("[", i ,"/", length(sample.names), "] ", sample.names[i]))

    system2(cutadapt,
        args = c(R1.flags, R2.flags,
            "-n 2",                   # -n 2 required to remove FWD and REV from reads
            "--match-read-wildcards", # enable IUPAC nucleotide codes (wildcard characters)
            "--length 300",           # Truncate reads to 300 bp
            "-m 150",                 # discard reads shorter than LEN (avoid length zero sequences)
            "--overlap 10",           # min overlap between read and adapter for an adapter to be found
            "-j 0",                   # auto-detection of CPU cores, only available on Python 3
            "-o", fnFs.cut[i], "-p", fnRs.cut[i], # trimmed files
            fnFs[i], fnRs[i]),        # input files
        stdout = log.cut[i], stderr = log.cut[i]  # log file
    )
}

# Show the first 15 files produced in the trimmed folder
head(list.files(trimmed), 15)
##  [1] "ERR2730149_1.fastq.gz" "ERR2730149_2.fastq.gz" "ERR2730149.log"       
##  [4] "ERR2730151_1.fastq.gz" "ERR2730151_2.fastq.gz" "ERR2730151.log"       
##  [7] "ERR2730154_1.fastq.gz" "ERR2730154_2.fastq.gz" "ERR2730154.log"       
## [10] "ERR2730155_1.fastq.gz" "ERR2730155_2.fastq.gz" "ERR2730155.log"       
## [13] "ERR2730158_1.fastq.gz" "ERR2730158_2.fastq.gz" "ERR2730158.log"

Inspect read quality profiles

We use the plotQualityProfile function to plot the quality profiles of the trimmed fastq files

# Re-list the fastq files, this time from the trimmed folder. Patterns are
# escaped and anchored (an unescaped "." matches any character, and the
# trimmed folder also contains .log files).
fns <- sort(list.files(trimmed, full.names = TRUE))
fnFs <- fns[grep("_1\\.fastq\\.gz$", fns)]
fnRs <- fns[grep("_2\\.fastq\\.gz$", fns)]
sample.names <- sub("_1\\.fastq\\.gz$", "", basename(fnFs))

# Plot the forward and reverse quality profiles of each sample into one PDF.
# seq_along() is safer than 1:length(): with zero samples it yields an empty
# loop instead of iterating over c(1, 0).
ii <- seq_along(sample.names)
pdf(paste0(images, "/plotQualityProfile.pdf"), width = 8, height = 8, pointsize = 12)
for (i in ii) {
    message(paste0("[", i ,"/", length(sample.names), "] ", sample.names[i]))
    print(plotQualityProfile(fnFs[i]) + ggtitle("Fwd"))
    print(plotQualityProfile(fnRs[i]) + ggtitle("Rev"))
}
invisible(dev.off())

We review the quality profiles, the quality distribution of forward reads appears to drop after position 260 and position 200 for reverse reads. We will use these positions to truncate forward and reverse reads respectively in the next step.

Filter and trim

Remember to review plotQualityProfile.pdf to select the best parameters for the truncLen argument

# Paths to the dada2 quality-filtered fastq files
filtFs <- file.path(filt, basename(fnFs))
filtRs <- file.path(filt, basename(fnRs))

# Quality filtering and truncation. Keep these parameters consistent
# between runs of the same study; truncLen values come from reviewing
# the quality profiles (forward 260, reverse 200).
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
        truncLen = c(260,200), minLen = 200, maxN = 0, truncQ = 2, maxEE = c(2,5),
        rm.phix = TRUE, compress = TRUE, verbose = TRUE, multithread = TRUE)

# Summarise read counts before (reads.in) and after (reads.out) filtering
out <- as.data.frame(out)
rownames(out) <- sample.names
head(out, 10)
##            reads.in reads.out
## ERR2730149    65747     58829
## ERR2730151    70743     61547
## ERR2730154    91558     81374
## ERR2730155    58955     53269
## ERR2730158   128246    116847
## ERR2730162   134565    121999
## ERR2730163   116621    105093
## ERR2730165    57401     51513
## ERR2730166   121037    108838
## ERR2730170    57359     51276

Learn the Error Rates

Use the learnErrors function to perform dereplication and learn the error rates. The derepFastq function used in the past workflow has been integrated into the learnErrors function

# Learn the error model from the filtered reads (includes dereplication)
errF <- learnErrors(filtFs, multithread = TRUE)
## 128404900 total bases in 493865 reads from 6 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread = TRUE)
## 119791600 total bases in 598958 reads from 7 samples will be used for learning the error rates.

# Write the estimated vs observed error-rate plots to a PDF for review
pdf(paste0(images, "/plotErrors.pdf"), width = 10, height = 10, pointsize = 12)
plotErrors(errF, nominalQ = TRUE)
plotErrors(errR, nominalQ = TRUE)
invisible(dev.off())

Forward reads

Reverse reads

Sample Inference

We now apply the core sample inference algorithm to the filtered and trimmed sequence data

Note: By default, the dada function processes each sample independently (i.e. pool = FALSE); if your samples are from an extremely diverse community (e.g. soil), pooling (i.e. pool = TRUE) or pseudo-pooling (recommended; i.e. pool = "pseudo") might help in identifying the rare ASVs in each sample

# Per-sample denoising with the learned error models
# (pool = FALSE: each sample is processed independently)
dadaFs = dada(filtFs, err = errF, pool = FALSE, multithread = TRUE)
dadaRs = dada(filtRs, err = errR, pool = FALSE, multithread = TRUE)

Merge paired reads

We now merge the forward and reverse reads together to obtain the full denoised sequences

# Merge each denoised forward/reverse read pair into full-length sequences
mergers = mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = TRUE)

Construct sequence table

We now construct an amplicon sequence variant (ASV) table

# Build the sample-by-ASV abundance matrix
seqtab = makeSequenceTable(mergers)

View the length frequency distribution with table

# Frequency distribution of merged sequence lengths
table(nchar(getSequences(seqtab)))
## 
##   260   261   264   267   270   272   273   275   277   278   279   281   282   284   286 
##   460     7     1     1     7     3     7     2    10     7    11    14     1     1     1 
##   288   300   301   313   317   320   325   335   341   365   384   385   386   400   401 
##     1     2     1     1     2     1     2     1     1     1     4     2     3     2   724 
##   402   403   404   405   406   407   408   409   410   411   412   413   417   418   419 
## 11291  2583  4588  4221   126   257   112   124    34   264    13     1     2     8   101 
##   420   421   422   423   424   425   426   427   428   429   430   431   432   433   438 
##    67   531  3603   325    82    68   206  1082   452     7    12     1     8     3     1 
##   439   440   441   442   443   444   445   446   447   448 
##    24     7    13    10     9    10     4     6     5     7

Save sequence table

# Persist the sequence table so later steps can resume without re-running dada2
saveRDS(seqtab, "seqtab.rds")

# Or as an example, save only the first 5 samples
# saveRDS(seqtab[c(1:5),], "seqtab.rds")

Remove chimeras

# Remove chimeric sequences by consensus across samples
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                   multithread = TRUE, verbose = TRUE)
## Identified 23837 bimeras out of 31549 input sequences.
rownames(seqtab.nochim) <- sample.names

View the dimensions of seqtab and seqtab.nochim

# Dimensions before chimera removal: samples x ASVs
dim(seqtab)
## [1]   126 31549
# Dimensions after chimera removal
dim(seqtab.nochim)
## [1]  126 7712

Print the proportion of non-chimeras in merged sequence reads

# Fraction of merged reads retained after chimera removal
sum(seqtab.nochim)/sum(seqtab)
## [1] 0.9684075

Track reads through the pipeline

# Count the reads represented by a dada2 object (dada-, derep- or merger-class)
getN <- function(x) sum(getUniques(x))

# Per-sample read counts at each stage of the pipeline.
# vapply() replaces sapply(): the result is guaranteed to be a numeric
# vector regardless of input shape.
track <- cbind(out,
               vapply(dadaFs, getN, numeric(1)),
               vapply(dadaRs, getN, numeric(1)),
               vapply(mergers, getN, numeric(1)),
               rowSums(seqtab.nochim))
colnames(track) <- c("Trimmed", "Filtered", "denoisedF", "denoisedR", "merged", "nonchim")
track <- cbind(data.frame(SampleID = sample.names), track)

# Write the tracking table; TRUE/FALSE are used rather than T/F,
# which are ordinary variables and can be reassigned
write.table(track, file = paste0(outfiles, "/track.txt"), sep = "\t", quote = FALSE,
        row.names = FALSE, col.names = TRUE)

Assign taxonomy

Set the full-path to the Silva and NCBI database

# Locations of the reference databases; edit dbpath to match your system
dbpath <- "/path-to-db/"
ref1 <- paste0(dbpath, "silva_nr_v138_train_set.fa.gz")       # SILVA training set
ref2 <- paste0(dbpath, "silva_species_assignment_v138.fa.gz") # SILVA species assignment
ref3 <- paste0(dbpath, "16SMicrobial.fa.gz")                  # NCBI 16S Microbial

Use the assignTaxonomy function to classify sequences against the SILVA reference training dataset ref1, and use the assignSpecies function to perform taxonomic assignment to the species level by exact matching against the SILVA reference dataset ref2 and the NCBI reference dataset ref3

# Taxonomic classification against the SILVA training set (down to genus),
# keeping the bootstrap confidence values (outputBootstraps = TRUE)
taxtab <- assignTaxonomy(seqtab.nochim, refFasta = ref1, minBoot = 80,
            tryRC = TRUE, outputBootstraps = TRUE, verbose = TRUE, multithread = TRUE)
## Finished processing reference fasta.
# Species-level assignment by exact matching against SILVA (ref2) and NCBI (ref3)
spec_silva <- assignSpecies(getSequences(seqtab.nochim), ref2, allowMultiple = FALSE,
               tryRC = TRUE, verbose = TRUE)
## 480 out of 7712 were assigned to the species level.
spec_ncbi <- assignSpecies(getSequences(seqtab.nochim), ref3, allowMultiple = FALSE,
              tryRC = TRUE, verbose = TRUE)
## 351 out of 7712 were assigned to the species level.

Combine species-level taxonomic assignment from 2 reference sources

# Zero-padded sequence-variant identifiers, e.g. "SV0001" ... "SV7712"
SVformat <- paste0("%0", nchar(as.character(ncol(seqtab.nochim))), "d")
svid <- paste0("SV", sprintf(SVformat, seq(ncol(seqtab.nochim))))

# Species assignments from SILVA, indexed by SV id
s_silva <- as.data.frame(spec_silva, stringsAsFactors = FALSE)
rownames(s_silva) <- svid

# Species assignments from NCBI, indexed by SV id; strip the square
# brackets NCBI uses around provisional genus names
s_ncbi <- as.data.frame(spec_ncbi, stringsAsFactors = FALSE)
rownames(s_ncbi) <- svid
s_ncbi$Genus <- gsub("\\[|\\]", "", s_ncbi$Genus)

# Combine the two sources side by side, then split into three subsets:
# NCBI hit, SILVA-only hit, and no hit in either source
s_merged <- cbind(s_ncbi, s_silva)
colnames(s_merged) <- c("nGenus", "nSpecies", "sGenus", "sSpecies")
s_merged1 <- s_merged[!is.na(s_merged$nSpecies), ]                            # NCBI assigned
colnames(s_merged1)[1:2] <- c("Genus", "Species")
s_merged2 <- s_merged[is.na(s_merged$nSpecies) & !is.na(s_merged$sSpecies), ] # SILVA only
colnames(s_merged2)[3:4] <- c("Genus", "Species")
s_merged3 <- s_merged[is.na(s_merged$nSpecies) & is.na(s_merged$sSpecies), ]  # unassigned
colnames(s_merged3)[3:4] <- c("Genus", "Species")

# Stack the three subsets and restore the original SV order
# (the zero-padded ids sort lexicographically in numeric order)
s_final <- rbind(s_merged1[, c("Genus", "Species")],
        s_merged2[, c("Genus", "Species")],
        s_merged3[, c("Genus", "Species")])
s_final <- s_final[order(row.names(s_final)), ]

s_final <- as.matrix(s_final)

# Index of the genus column in the taxonomy table; fall back to the
# last (lowest-rank) column when no "Genus" column exists
gcol <- if ("Genus" %in% colnames(taxtab$tax)) {
    which(colnames(taxtab$tax) == "Genus")
} else {
    ncol(taxtab$tax)
}

# TRUE when the genus from the taxonomy table agrees with the genus of the
# binomial species assignment: either an exact match, or the binomial genus
# appears as one alternative of a compound name such as "Escherichia/Shigella"
# (separated by a space, an underscore, or split.glyph). NA on either side
# counts as no match.
matchGenera <- function(gen.tax, gen.binom, split.glyph = "/") {
    if (is.na(gen.tax) || is.na(gen.binom)) {
        return(FALSE)
    }
    exact <- gen.tax == gen.binom
    leading <- grepl(paste0("^", gen.binom, "[ _", split.glyph, "]"), gen.tax)
    trailing <- grepl(paste0(split.glyph, gen.binom, "$"), gen.tax)
    exact || leading || trailing
}

# For each ASV, check that the species assignment's genus is consistent
# with the genus already assigned from the SILVA training set
gen.match <- mapply(matchGenera, taxtab$tax[, gcol], s_final[, 1])

# Append the merged species calls as a new "Species" column
taxtab$tax <- cbind(taxtab$tax, Species = s_final[, 2])

print(paste(sum(!is.na(s_final[,2])), "out of",
        nrow(s_final), "were assigned to the species level."))
## [1] "597 out of 7712 were assigned to the species level."
# Blank out species whose genus disagrees with the taxonomy table
taxtab$tax[!gen.match, "Species"] <- NA
print(paste("Of which", sum(!is.na(taxtab$tax[,"Species"])),
        "had genera consistent with the input table."))
## [1] "Of which 499 had genera consistent with the input table."

Multiple sequence alignment

Prepare a data.frame df from seqtab.nochim and taxtab

# Assemble per-ASV metadata: sequence, total abundance, SV id and taxonomy
df <- data.frame(sequence = colnames(seqtab.nochim),
        abundance = colSums(seqtab.nochim),
        stringsAsFactors = FALSE)
df$id <- svid
# merge() on row.names adds a "Row.names" first column, dropped again below
df <- merge(df, as.data.frame(taxtab), by = "row.names")
rownames(df) <- df$id
df <- df[order(df$id), 2:ncol(df)]

Performs alignment of multiple unaligned sequences

# Align all ASV sequences with DECIPHER, named by SV id; anchor = NA disables
# anchoring (recommended for similar-length sequences such as amplicons)
alignment = DECIPHER::AlignSeqs(Biostrings::DNAStringSet(setNames(df$sequence, df$id)), anchor = NA)
## Determining distance matrix based on shared 8-mers:
## ==========================================================================================
## 
## Time difference of 541.23 secs
## 
## Clustering into groups by similarity:
## ==========================================================================================
## 
## Time difference of 9.04 secs
## 
## Aligning Sequences:
## ==========================================================================================
## 
## Time difference of 86.35 secs
## 
## Iteration 1 of 2:
## 
## Determining distance matrix based on alignment:
## ==========================================================================================
## 
## Time difference of 68.35 secs
## 
## Reclustering into groups by similarity:
## ==========================================================================================
## 
## Time difference of 6.75 secs
## 
## Realigning Sequences:
## ==========================================================================================
## 
## Time difference of 61.35 secs
## 
## Iteration 2 of 2:
## 
## Determining distance matrix based on alignment:
## ==========================================================================================
## 
## Time difference of 59.98 secs
## 
## Reclustering into groups by similarity:
## ==========================================================================================
## 
## Time difference of 6.61 secs
## 
## Realigning Sequences:
## ==========================================================================================
## 
## Time difference of 14.64 secs
## 
## Refining the alignment:
## ==========================================================================================
## 
## Time difference of 0.91 secs

Export alignment

# Convert the alignment to phangorn's phyDat format and export it in both
# fasta (for RAxML-NG) and phylip (for RAxML) formats
phang.align = phangorn::phyDat(as(alignment, "matrix"), type = "DNA")
phangorn::write.phyDat(phang.align, file = "alignment.fasta", format = "fasta")
phangorn::write.phyDat(phang.align, file = "alignment.aln", format = "phylip")

Construct phylogenetic tree

Set the full-path to the RAxML and RAxML-NG

# Full paths to the RAxML and RAxML-NG executables; edit to match your system
raxml = "/path-to-raxml"
raxmlng = "/path-to-raxml-ng"

Run RAxML and RAxML-NG

# Build a starting tree with RAxML (GTRCAT model, fixed seeds for reproducibility)
system2(raxml, args = c("-T 2", "-f E", "-p 1234", "-x 5678", "-m GTRCAT", "-N 1", 
            "-s alignment.aln", "-n raxml_tree_GTRCAT"))

# Re-evaluate branch lengths and model parameters with RAxML-NG (GTR+G),
# starting from the RAxML tree produced above
system2(raxmlng, args = c("--evaluate", "--force", "--seed 1234", "--log progress", "--threads 2", 
              "--msa alignment.fasta",  "--model GTR+G", "--brlen scaled", 
              "--tree RAxML_fastTree.raxml_tree_GTRCAT", "--prefix GTRCAT"))

Import tree using the read_tree function from phyloseq

# Import the best-scoring RAxML-NG tree as a phylo object for phyloseq
raxml_tree = read_tree("GTRCAT.raxml.bestTree")

Load sample information

# Read the sample metadata; all columns are read as character so the
# factor conversions below are explicit
samdf <- data.frame(fread("sample.meta", colClasses = "character"))

rownames(samdf) <- samdf$Sample_ID
# factor() sorts its levels alphabetically by default, so a single call
# replaces the previous as.factor() followed by re-levelling with sort()
samdf$Sample_ID <- factor(samdf$Sample_ID)
samdf$Subject <- as.factor(samdf$Subject)
# Group2 is built from the character columns before they become factors
samdf$Group2 <- paste(samdf$Group, samdf$Time, sep = "_")
samdf$Group <- as.factor(samdf$Group)
samdf$Group2 <- as.factor(samdf$Group2)
samdf$Time <- as.factor(samdf$Time)

head(samdf)
##            Run_ID Sample_ID   Sample_title Subject   Group     Time           Group2
## C0005B ERR2730149    C0005B C0005 baseline   C0005 control baseline control_baseline
## C0009B ERR2730151    C0009B C0009 baseline   C0009 control baseline control_baseline
## C0019B ERR2730154    C0019B C0019 baseline   C0019 control baseline control_baseline
## C0020B ERR2730155    C0020B C0020 baseline   C0020 control baseline control_baseline
## C0024B ERR2730158    C0024B C0024 baseline   C0024 control baseline control_baseline
## C0032B ERR2730162    C0032B C0032 baseline   C0032 control baseline control_baseline

Handoff to phyloseq

Prepare new_seqtab and tax data.frames containing abundance and taxonomy information respectively

# Abundance table keyed by SV id instead of the full ASV sequence
new_seqtab <- seqtab.nochim
colnames(new_seqtab) <- df[match(colnames(new_seqtab), df$sequence), ]$id

# Update rownames in new_seqtab from Run_ID to Sample_ID
rownames(new_seqtab) <- as.character(samdf[match(rownames(new_seqtab), samdf$Run_ID), ]$Sample_ID)

# Taxonomy table keyed by SV id as well
new_taxtab <- taxtab
rownames(new_taxtab$tax) <- df[match(rownames(new_taxtab$tax), df$sequence), ]$id

# Plain-character Family/Genus columns for downstream manipulation
tax <- as.data.frame(new_taxtab$tax)
tax$Family <- as.character(tax$Family)
tax$Genus <- as.character(tax$Genus)

Construct a phyloseq object

# Assemble the phyloseq object from taxonomy, sample data, abundances and tree
ps <- phyloseq(tax_table(as.matrix(tax)),
          sample_data(samdf),
          otu_table(new_seqtab, taxa_are_rows = FALSE),
          phy_tree(raxml_tree))

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7712 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 7712 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7712 tips and 7710 internal nodes ]

Save current workspace

# Save the complete workspace so downstream analyses can resume from here
save.image(file = "1_dada2_tutorial.RData")

Session information

# Record R and package versions for reproducibility
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/p27/lib/libopenblasp-r0.3.7.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.2     phyloseq_1.30.0   data.table_1.12.8 dada2_1.14.1     
## [5] Rcpp_1.0.5        knitr_1.29       
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.46.0              splines_3.6.2               jsonlite_1.7.0             
##  [4] foreach_1.5.0               RcppParallel_5.0.2          stats4_3.6.2               
##  [7] latticeExtra_0.6-29         GenomeInfoDbData_1.2.2      Rsamtools_2.2.3            
## [10] yaml_2.2.1                  pillar_1.4.5                lattice_0.20-41            
## [13] glue_1.4.1                  digest_0.6.25               GenomicRanges_1.38.0       
## [16] RColorBrewer_1.1-2          XVector_0.26.0              colorspace_1.4-1           
## [19] htmltools_0.5.0             Matrix_1.2-18               plyr_1.8.6                 
## [22] pkgconfig_2.0.3             ShortRead_1.44.3            zlibbioc_1.32.0            
## [25] purrr_0.3.4                 scales_1.1.1                jpeg_0.1-8.1               
## [28] BiocParallel_1.20.1         tibble_3.0.2                mgcv_1.8-31                
## [31] farver_2.0.3                generics_0.0.2              IRanges_2.20.2             
## [34] ellipsis_0.3.1              withr_2.2.0                 SummarizedExperiment_1.16.1
## [37] BiocGenerics_0.32.0         survival_3.2-3              magrittr_1.5               
## [40] crayon_1.3.4                evaluate_0.14               nlme_3.1-148               
## [43] MASS_7.3-51.6               hwriter_1.3.2               vegan_2.5-6                
## [46] tools_3.6.2                 lifecycle_0.2.0             matrixStats_0.56.0         
## [49] stringr_1.4.0               Rhdf5lib_1.8.0              S4Vectors_0.24.4           
## [52] munsell_0.5.0               cluster_2.1.0               DelayedArray_0.12.3        
## [55] Biostrings_2.54.0           ade4_1.7-15                 compiler_3.6.2             
## [58] GenomeInfoDb_1.22.1         rlang_0.4.6                 rhdf5_2.30.1               
## [61] grid_3.6.2                  RCurl_1.98-1.2              iterators_1.0.12           
## [64] biomformat_1.14.0           igraph_1.2.5                labeling_0.3               
## [67] bitops_1.0-6                rmarkdown_2.3               multtest_2.42.0            
## [70] gtable_0.3.0                codetools_0.2-16            reshape2_1.4.4             
## [73] R6_2.4.1                    gridExtra_2.3               GenomicAlignments_1.22.1   
## [76] dplyr_1.0.0                 permute_0.9-5               ape_5.4                    
## [79] stringi_1.4.6               parallel_3.6.2              vctrs_0.3.1                
## [82] png_0.1-7                   tidyselect_1.1.0            xfun_0.15